package com.jlt.baidu.utils;

import java.util.ArrayList;
import java.util.Comparator;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
import java.util.TreeMap;

import org.bytedeco.javacpp.BytePointer;
import org.bytedeco.leptonica.PIX;
import org.bytedeco.leptonica.global.lept;
import org.bytedeco.tesseract.TessBaseAPI;

import com.jlt.ocr.tesseract.OcrRectangle;
import com.jlt.ocr.tesseract.OcrResult;

import lombok.extern.slf4j.Slf4j;

/**
 * 针对识别的结果进行细节处理
 * 
 * @author Ives.Chen
 *
 */
@Slf4j
public class OcrResultHandleUtil {

    /**
     * 通用图片文字识别核心代码
     * 
     * @param api
     * @param fileName
     * @param rangeList
     */
    public static List<OcrResult> commonImageOcr(TessBaseAPI api, String fileName, List<OcrRectangle> rangeList) {
        TreeMap<Integer, List<OcrRectangle>> rows = rectRowHandle(rangeList);
        List<OcrResult> result = new ArrayList<>();
        // ocr识别
        PIX image = lept.pixRead(fileName);
        api.SetImage(image);
        int rowIndex = 1;
        for (List<OcrRectangle> row : rows.values()) {
            OcrResult model = new OcrResult();
            model.setRow(rowIndex);
            StringBuilder ocrResult = new StringBuilder();
            // 排序 从左向右读取
            row.sort(Comparator.comparingInt(OcrRectangle::getLeft));
            for (OcrRectangle range : row) {
                try {
                    ocrCoreBusiness(api, range, ocrResult);
                } catch (Exception e) {
                    log.error("ocr识别区域文字报错,文件名:%s,第%d行", fileName, rowIndex);
                    continue;
                }
            }
            model.setWords(ocrResult.toString());
            rowIndex++;
            result.add(model);
        }
        lept.pixDestroy(image);
        return result;
    }

    /**
     * 单纯表格文字识别核心代码,表格读取重现重复读取的问题，对结果进行去除，数量为2，大于2偶数为业务重复，返回的行数是奇数位
     * 
     * @param api
     * @param fileName
     * @param rangeList
     */
    public static List<OcrResult> excelImageOcr(TessBaseAPI api, String fileName, List<OcrRectangle> rangeList) {
        TreeMap<Integer, List<OcrRectangle>> columns = rectColumnHandle(rangeList);
        // ocr识别
        PIX image = lept.pixRead(fileName);
        api.SetImage(image);
        int columnIndex = 1;
        Map<Integer, List<OcrResult>> resultMap = new HashMap<>();
        for (List<OcrRectangle> column : columns.values()) {
            List<OcrResult> result = new ArrayList<>();
            // 排序 从左向右读取
            column.sort(Comparator.comparingInt(OcrRectangle::getTop));
            int rowIndex = 1;
            for (OcrRectangle range : column) {
                OcrResult model = new OcrResult();
                model.setColumn(columnIndex);
                StringBuilder ocrResult = new StringBuilder();
                try {
                    model.setRow(rowIndex);
                    rowIndex++;
                    ocrCoreBusiness(api, range, ocrResult);
                } catch (Exception e) {
                    log.error("ocr识别区域文字报错,文件名:%s,第%d行", fileName, rowIndex);
                    continue;
                }
                model.setWords(ocrResult.toString());
                result.add(model);
            }
            resultMap.put(columnIndex, result);
            columnIndex++;
        }
        lept.pixDestroy(image);
        return excelRsultRepeatHandle(resultMap);
    }

    /**
     * 对excel识别的结果进行去重处理，0/1这样递进重复
     * 
     * @param resultMap
     * @return
     */
    private static List<OcrResult> excelRsultRepeatHandle(Map<Integer, List<OcrResult>> resultMap) {
        List<OcrResult> result = new ArrayList<>();
        for (List<OcrResult> list : resultMap.values()) {
            for (int i = 0; i < list.size(); i = i + 2) {
                result.add(list.get(i));
            }
        }
        return result;
    }

    /**
     * 对已经筛选出来的区域，进行分组，按行取值
     * 
     * @param rangeList
     */
    private static TreeMap<Integer, List<OcrRectangle>> rectRowHandle(List<OcrRectangle> rangeList) {
        // 对所有矩形块进行分行筛选
        TreeMap<Integer, List<OcrRectangle>> rows = new TreeMap<>();
        for (OcrRectangle tmp : rangeList) {
            boolean flag = true;
            for (Integer top : rows.keySet()) {
                // 高度相差20 判断为同一行
                if (Math.abs(top - tmp.getTop()) <= 20) {
                    rows.get(top).add(tmp);
                    flag = false;
                    break;
                }
            }
            if (flag) {
                List<OcrRectangle> row = new ArrayList<>();
                row.add(tmp);
                rows.put(tmp.getTop(), row);
            }
        }
        return rows;
    }

    /**
     * 对已经筛选出来的区域，进行分组，按列取值
     * 
     * @param rangeList
     */
    private static TreeMap<Integer, List<OcrRectangle>> rectColumnHandle(List<OcrRectangle> rangeList) {
        // 对所有矩形块进行分列筛选
        TreeMap<Integer, List<OcrRectangle>> columns = new TreeMap<>();
        for (OcrRectangle tmp : rangeList) {
            boolean flag = true;
            for (Integer left : columns.keySet()) {
                // 宽度相差25 判断为同一列
                if (Math.abs(left - tmp.getLeft()) <= 25) {
                    columns.get(left).add(tmp);
                    flag = false;
                    break;
                }
            }
            if (flag) {
                List<OcrRectangle> column = new ArrayList<>();
                column.add(tmp);
                columns.put(tmp.getLeft(), column);
            }
        }
        return columns;
    }

    /**
     * 核心代码，识别出文字内容
     * 
     * @param api
     * @param range
     * @param ocrResult
     */
    private static void ocrCoreBusiness(TessBaseAPI api, OcrRectangle range, StringBuilder ocrResult) {
        api.SetRectangle(range.getLeft(), range.getTop(), range.getWidth(), range.getHeight());
        api.Recognize(null);
        BytePointer outText = api.GetUTF8Text();
        if (outText != null) {
            String text = outText.getString().trim().replace("\n", "");
            outText.deallocate();
            ocrResult.append(text);
        }
    }
}
